{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here I will to show how to use bayes on multi-class classification/discrimination\n",
    "\n",
    "import class sklearn.naive_bayes.MultinomialNB for Multinomial logistic regression (logistic regression of multi-class)\n",
    "\n",
    "But if you want to classify binary/boolean class, it is better to use BernoulliNB "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I will use also compare accuracy for using BOW, TF-IDF, and HASHING for vectorizing technique"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "# to get f1 score\n",
    "from sklearn import metrics\n",
    "import numpy as np\n",
    "import sklearn.datasets\n",
    "import re\n",
    "from sklearn.feature_extraction.text import HashingVectorizer\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.cross_validation import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define some function to help us for preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# clear string\n",
    "def clearstring(string):\n",
    "    string = re.sub('[^\\'\\\"A-Za-z0-9 ]+', '', string)\n",
    "    string = string.split(' ')\n",
    "    string = filter(None, string)\n",
    "    string = [y.strip() for y in string]\n",
    "    string = ' '.join(string)\n",
    "    return string\n",
    "\n",
    "# because of sklean.datasets read a document as a single element\n",
    "# so we want to split based on new line\n",
    "def separate_dataset(trainset):\n",
    "    datastring = []\n",
    "    datatarget = []\n",
    "    for i in range(len(trainset.data)):\n",
    "        data_ = trainset.data[i].split('\\n')\n",
    "        # python3, if python2, just remove list()\n",
    "        data_ = list(filter(None, data_))\n",
    "        for n in range(len(data_)):\n",
    "            data_[n] = clearstring(data_[n])\n",
    "        datastring += data_\n",
    "        for n in range(len(data_)):\n",
    "            datatarget.append(trainset.target[i])\n",
    "    return datastring, datatarget"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']\n",
      "416809\n",
      "416809\n"
     ]
    }
   ],
   "source": [
    "# you can change any encoding type\n",
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset)\n",
    "print (trainset.target_names)\n",
    "print (len(trainset.data))\n",
    "print (len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/feature_extraction/hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
      "  \" in version 0.21.\", DeprecationWarning)\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/feature_extraction/hashing.py:94: DeprecationWarning: the option non_negative=True has been deprecated in 0.19 and will be removed in version 0.21.\n",
      "  \" in version 0.21.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "# bag-of-word\n",
    "bow = CountVectorizer().fit_transform(trainset.data)\n",
    "\n",
    "#tf-idf, must get from BOW first\n",
    "tfidf = TfidfTransformer().fit_transform(bow)\n",
    "\n",
    "#hashing, default n_features, probability cannot divide by negative\n",
    "hashing = HashingVectorizer(non_negative = True).fit_transform(trainset.data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Feed Naive Bayes using BOW\n",
    "\n",
    "but split it first into train-set (80% of our data-set), and validation-set (20% of our data-set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.859072479067\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.90      0.84      0.87     11464\n",
      "       fear       0.84      0.81      0.82      9455\n",
      "        joy       0.85      0.93      0.89     28246\n",
      "       love       0.82      0.61      0.70      6920\n",
      "    sadness       0.87      0.94      0.91     24263\n",
      "   surprise       0.84      0.34      0.49      3014\n",
      "\n",
      "avg / total       0.86      0.86      0.85     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(bow, trainset.target, test_size = 0.2)\n",
    "\n",
    "bayes_multinomial = MultinomialNB().fit(train_X, train_Y)\n",
    "predicted = bayes_multinomial.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Feed Naive Bayes using TF-IDF\n",
    "\n",
    "but split it first into train-set (80% of our data-set), and validation-set (20% of our data-set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.734855209808\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.93      0.54      0.69     11336\n",
      "       fear       0.91      0.37      0.53      9603\n",
      "        joy       0.68      0.98      0.80     28062\n",
      "       love       0.96      0.16      0.27      7085\n",
      "    sadness       0.74      0.94      0.83     24278\n",
      "   surprise       0.94      0.04      0.08      2998\n",
      "\n",
      "avg / total       0.79      0.73      0.69     83362\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(tfidf, trainset.target, test_size = 0.2)\n",
    "\n",
    "bayes_multinomial = MultinomialNB().fit(train_X, train_Y)\n",
    "predicted = bayes_multinomial.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Feed Naive Bayes using hashing\n",
    "\n",
    "but split it first into train-set (80% of our data-set), and validation-set (20% of our data-set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy validation set:  0.578524987404\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "      anger       0.93      0.07      0.12     11449\n",
      "       fear       0.96      0.03      0.05      9533\n",
      "        joy       0.49      1.00      0.66     28047\n",
      "       love       1.00      0.00      0.01      6967\n",
      "    sadness       0.76      0.79      0.78     24408\n",
      "   surprise       0.00      0.00      0.00      2958\n",
      "\n",
      "avg / total       0.71      0.58      0.47     83362\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "train_X, test_X, train_Y, test_Y = train_test_split(hashing, trainset.target, test_size = 0.2)\n",
    "\n",
    "bayes_multinomial = MultinomialNB().fit(train_X, train_Y)\n",
    "predicted = bayes_multinomial.predict(test_X)\n",
    "print('accuracy validation set: ', np.mean(predicted == test_Y))\n",
    "\n",
    "# print scores\n",
    "print(metrics.classification_report(test_Y, predicted, target_names = trainset.target_names))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
